# Numeric Univariate Analysis
# Importing the Data
```r
#install.packages(c("FactoMineR", "factoextra"))
library("FactoMineR")
library("factoextra")
# Load the decathlon2 example data and preview its first six rows.
data("decathlon2")
head(decathlon2, n = 6L)
## X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69 43.75
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05 50.72
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99 40.87
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31 46.26
## ZSIVOCZKY 11.13 7.30 13.48 2.01 48.62 14.17 45.67
## McMULLEN 10.83 7.31 13.76 2.13 49.91 14.38 44.41
## Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE 5.02 63.19 291.7 1 8217 Decastar
## CLAY 4.92 60.15 301.5 2 8122 Decastar
## BERNARD 5.32 62.77 280.1 4 8067 Decastar
## YURKOV 4.72 63.44 276.4 5 8036 Decastar
## ZSIVOCZKY 4.42 55.37 268.0 7 8004 Decastar
## McMULLEN 4.42 56.37 285.1 8 7995 Decastar
- Number of rows and columns
- Variables: their types and values
# Compact overview: dimensions plus the type and leading values of each column.
library("tibble")
glimpse(x = decathlon2)
## Rows: 27
## Columns: 13
## $ X100m <dbl> 11.04, 10.76, 11.02, 11.34, 11.13, 10.83, 11.64, 11.37, 1…
## $ Long.jump <dbl> 7.58, 7.40, 7.23, 7.09, 7.30, 7.31, 6.81, 7.56, 6.97, 7.2…
## $ Shot.put <dbl> 14.83, 14.26, 14.25, 15.19, 13.48, 13.76, 14.57, 14.41, 1…
## $ High.jump <dbl> 2.07, 1.86, 1.92, 2.10, 2.01, 2.13, 1.95, 1.86, 1.95, 1.9…
## $ X400m <dbl> 49.81, 49.37, 48.93, 50.42, 48.62, 49.91, 50.14, 51.10, 4…
## $ X110m.hurdle <dbl> 14.69, 14.05, 14.99, 15.31, 14.17, 14.38, 14.93, 15.06, 1…
## $ Discus <dbl> 43.75, 50.72, 40.87, 46.26, 45.67, 44.41, 47.60, 44.99, 4…
## $ Pole.vault <dbl> 5.02, 4.92, 5.32, 4.72, 4.42, 4.42, 4.92, 4.82, 4.72, 4.6…
## $ Javeline <dbl> 63.19, 60.15, 62.77, 63.44, 55.37, 56.37, 52.33, 57.19, 5…
## $ X1500m <dbl> 291.70, 301.50, 280.10, 276.40, 268.00, 285.10, 262.10, 2…
## $ Rank <int> 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7,…
## $ Points <int> 8217, 8122, 8067, 8036, 8004, 7995, 7802, 7733, 7708, 765…
## $ Competition <fct> Decastar, Decastar, Decastar, Decastar, Decastar, Decasta…
# NOTE(review): applied to a data frame, sample() shuffles the columns and
# keeps every row, as the printed result shows (27 rows, columns reordered).
# If a random subset of rows was intended, index the rows instead, e.g.
# decathlon2[sample(nrow(decathlon2), 5), ] -- confirm the intent.
sample(decathlon2)
## X110m.hurdle High.jump Javeline Shot.put Discus Points Long.jump
## SEBRLE 14.69 2.07 63.19 14.83 43.75 8217 7.58
## CLAY 14.05 1.86 60.15 14.26 50.72 8122 7.40
## BERNARD 14.99 1.92 62.77 14.25 40.87 8067 7.23
## YURKOV 15.31 2.10 63.44 15.19 46.26 8036 7.09
## ZSIVOCZKY 14.17 2.01 55.37 13.48 45.67 8004 7.30
## McMULLEN 14.38 2.13 56.37 13.76 44.41 7995 7.31
## MARTINEAU 14.93 1.95 52.33 14.57 47.60 7802 6.81
## HERNU 15.06 1.86 57.19 14.41 44.99 7733 7.56
## BARRAS 14.48 1.95 55.40 14.09 42.10 7708 6.97
## NOOL 15.29 1.98 57.44 12.68 37.92 7651 7.27
## BOURGUIGNON 15.67 1.86 54.68 13.46 40.49 7313 6.80
## Sebrle 14.05 2.12 70.52 16.36 48.72 8893 7.84
## Clay 14.13 2.06 69.71 15.23 50.11 8820 7.96
## Karpov 13.97 2.09 55.54 15.93 51.65 8725 7.81
## Macey 14.56 2.15 58.46 15.73 48.34 8414 7.47
## Warners 14.01 1.97 55.39 14.48 43.73 8343 7.74
## Zsivoczky 14.95 2.12 63.45 15.31 45.62 8287 7.14
## Hernu 14.25 2.03 57.76 14.65 44.72 8237 7.19
## Bernard 14.17 2.12 55.27 14.80 44.75 8225 7.48
## Schwarzl 14.25 1.94 56.32 14.01 42.43 8102 7.49
## Pogorelov 14.21 2.06 53.45 15.10 44.60 8084 7.31
## Schoenbeck 14.34 1.88 60.89 14.77 44.41 8077 7.30
## Barras 14.37 1.94 64.55 14.91 44.83 8067 6.99
## KARPOV 14.09 2.04 50.31 14.77 48.95 8099 7.30
## WARNERS 14.23 1.98 51.77 14.31 41.10 8030 7.60
## Nool 14.80 1.88 61.33 14.26 42.05 8235 7.53
## Drews 14.01 1.88 51.53 13.07 40.11 7926 7.38
## Rank X400m Pole.vault Competition X100m X1500m
## SEBRLE 1 49.81 5.02 Decastar 11.04 291.70
## CLAY 2 49.37 4.92 Decastar 10.76 301.50
## BERNARD 4 48.93 5.32 Decastar 11.02 280.10
## YURKOV 5 50.42 4.72 Decastar 11.34 276.40
## ZSIVOCZKY 7 48.62 4.42 Decastar 11.13 268.00
## McMULLEN 8 49.91 4.42 Decastar 10.83 285.10
## MARTINEAU 9 50.14 4.92 Decastar 11.64 262.10
## HERNU 10 51.10 4.82 Decastar 11.37 285.10
## BARRAS 11 49.48 4.72 Decastar 11.33 282.00
## NOOL 12 49.20 4.62 Decastar 11.33 266.60
## BOURGUIGNON 13 51.16 5.02 Decastar 11.36 291.70
## Sebrle 1 48.36 5.00 OlympicG 10.85 280.01
## Clay 2 49.19 4.90 OlympicG 10.44 282.00
## Karpov 3 46.81 4.60 OlympicG 10.50 278.11
## Macey 4 48.97 4.40 OlympicG 10.89 265.42
## Warners 5 47.97 4.90 OlympicG 10.62 278.05
## Zsivoczky 6 49.40 4.70 OlympicG 10.91 269.54
## Hernu 7 48.73 4.80 OlympicG 10.97 264.35
## Bernard 9 49.13 4.40 OlympicG 10.69 276.31
## Schwarzl 10 49.76 5.10 OlympicG 10.98 273.56
## Pogorelov 11 50.79 5.00 OlympicG 10.95 287.63
## Schoenbeck 12 50.30 5.00 OlympicG 10.90 278.82
## Barras 13 49.41 4.60 OlympicG 11.14 267.09
## KARPOV 3 48.37 4.92 Decastar 11.02 300.20
## WARNERS 6 48.68 4.92 Decastar 11.11 278.10
## Nool 8 48.81 5.40 OlympicG 10.80 276.33
## Drews 19 48.51 5.00 OlympicG 10.87 274.21
# Per-column summary: quartiles and mean for numeric columns, level counts
# for the factor column.
summary(object = decathlon2)
## X100m Long.jump Shot.put High.jump
## Min. :10.44 Min. :6.800 Min. :12.68 Min. :1.860
## 1st Qu.:10.84 1st Qu.:7.210 1st Qu.:14.17 1st Qu.:1.930
## Median :10.97 Median :7.310 Median :14.57 Median :1.980
## Mean :10.99 Mean :7.365 Mean :14.54 Mean :1.998
## 3rd Qu.:11.13 3rd Qu.:7.545 3rd Qu.:15.01 3rd Qu.:2.080
## Max. :11.64 Max. :7.960 Max. :16.36 Max. :2.150
## X400m X110m.hurdle Discus Pole.vault
## Min. :46.81 Min. :13.97 Min. :37.92 Min. :4.400
## 1st Qu.:48.70 1st Qu.:14.15 1st Qu.:42.27 1st Qu.:4.660
## Median :49.20 Median :14.34 Median :44.72 Median :4.900
## Mean :49.31 Mean :14.50 Mean :44.85 Mean :4.836
## 3rd Qu.:49.86 3rd Qu.:14.87 3rd Qu.:46.93 3rd Qu.:5.000
## Max. :51.16 Max. :15.67 Max. :51.65 Max. :5.400
## Javeline X1500m Rank Points Competition
## Min. :50.31 Min. :262.1 Min. : 1.000 Min. :7313 Decastar:13
## 1st Qu.:55.32 1st Qu.:271.6 1st Qu.: 4.000 1st Qu.:8000 OlympicG:14
## Median :57.19 Median :278.1 Median : 7.000 Median :8084
## Mean :58.32 Mean :278.5 Mean : 7.444 Mean :8119
## 3rd Qu.:62.05 3rd Qu.:283.6 3rd Qu.:10.500 3rd Qu.:8236
## Max. :70.52 Max. :301.5 Max. :19.000 Max. :8893
#https://stackoverflow.com/questions/50848273/call-many-variables-in-a-for-loop-with-dplyr-ggplot-function
#' Bar chart of the relative frequency of each value in one column.
#'
#' @param df A data frame.
#' @param x Name of the column to summarise, given as a string.
#' @return A ggplot object with one bar per distinct non-missing value of
#'   the column; bar height is that value's share of the non-missing rows.
plotUniCat <- function(df, x) {
  # Turn the column name into a symbol so it can be unquoted with `!!`.
  col_sym <- sym(x)

  # Drop missing values, count each distinct value, convert to proportions.
  non_missing <- filter(df, !is.na(!!col_sym))
  freq <- count(non_missing, !!col_sym)
  freq <- mutate(freq, prop = prop.table(n))

  ggplot(freq, aes(y = prop, x = !!col_sym)) +
    geom_bar(stat = "identity")
}
# Column names of the data set.
names(decathlon2)
## [1] "X100m" "Long.jump" "Shot.put" "High.jump" "X400m"
## [6] "X110m.hurdle" "Discus" "Pole.vault" "Javeline" "X1500m"
## [11] "Rank" "Points" "Competition"
# Internal structure: class, dimensions and the type of every column.
str(object = decathlon2)
## 'data.frame': 27 obs. of 13 variables:
## $ X100m : num 11 10.8 11 11.3 11.1 ...
## $ Long.jump : num 7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
## $ Shot.put : num 14.8 14.3 14.2 15.2 13.5 ...
## $ High.jump : num 2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
## $ X400m : num 49.8 49.4 48.9 50.4 48.6 ...
## $ X110m.hurdle: num 14.7 14.1 15 15.3 14.2 ...
## $ Discus : num 43.8 50.7 40.9 46.3 45.7 ...
## $ Pole.vault : num 5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
## $ Javeline : num 63.2 60.1 62.8 63.4 55.4 ...
## $ X1500m : num 292 302 280 276 268 ...
## $ Rank : int 1 2 4 5 7 8 9 10 11 12 ...
## $ Points : int 8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
## $ Competition : Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
library(dplyr)
# Keep only the numeric columns, then check the structure of the result.
data_num <- select_if(decathlon2, is.numeric)
str(object = data_num)
## 'data.frame': 27 obs. of 12 variables:
## $ X100m : num 11 10.8 11 11.3 11.1 ...
## $ Long.jump : num 7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
## $ Shot.put : num 14.8 14.3 14.2 15.2 13.5 ...
## $ High.jump : num 2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
## $ X400m : num 49.8 49.4 48.9 50.4 48.6 ...
## $ X110m.hurdle: num 14.7 14.1 15 15.3 14.2 ...
## $ Discus : num 43.8 50.7 40.9 46.3 45.7 ...
## $ Pole.vault : num 5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
## $ Javeline : num 63.2 60.1 62.8 63.4 55.4 ...
## $ X1500m : num 292 302 280 276 268 ...
## $ Rank : int 1 2 4 5 7 8 9 10 11 12 ...
## $ Points : int 8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
# Build one plotUniCat() chart per numeric column and collect them in a list.
variables <- names(data_num)
out <- lapply(variables, plotUniCat, df = decathlon2)
#https://stackoverflow.com/questions/17963962/plot-size-and-resolution-with-r-markdown-knitr-pandoc-beamer
# 4 x 3 grid with a default histogram for every numeric column.
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  # The `data_num[, i]` expression is kept as written: hist() builds its
  # default title from the argument expression.
  hist(data_num[, i], xlab = i)
}
# Same grid, now titled by column name and with a common count axis (0-20).
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    ylab = "frequency",
    ylim = c(0, 20),
    freq = TRUE
  )
}
# Count histograms titled by column name, each with its own y-axis range.
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    freq = TRUE
  )
}
# Kernel density estimate of every numeric column on a 4 x 3 grid.
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  plot(
    density(data_num[[i]]),
    main = i,
    xlab = i
  )
}
# Scatterplot matrix of the numeric columns using psych.
library("psych")
pairs.panels(x = data_num, col = "red")
# methods(class = class(decathlon2[, "Competition"]))
# List the methods available for objects of class "factor".
methods(class = "factor")
## [1] [ [[ [[<- [<- all.equal
## [6] as.character as.data.frame as.Date as.list as.logical
## [11] as.POSIXlt as.vector c coerce droplevels
## [16] format initialize is.na<- length<- levels<-
## [21] Math Ops plot print recode
## [26] relevel relist rep scale_type show
## [31] slotsFromS3 summary Summary type_sum xtfrm
## see '?methods' for accessing help and source code
# Distinct categories of the Competition factor.
levels(decathlon2$Competition)
## [1] "Decastar" "OlympicG"
# Number of categories of the Competition factor.
nlevels(decathlon2[["Competition"]])
## [1] 2
# Number of observations per competition.
summary(decathlon2$Competition)
## Decastar OlympicG
## 13 14
# Correlation matrix with GGally
library("GGally")
# Scatterplots, per-variable distributions and correlation coefficients
# for the numeric columns.
ggpairs(data = data_num, title = "correlogram with ggpairs()")
library("GGally")
# Compact visualization of the pairwise correlations.
ggcorr(data = data_num, method = c("everything", "pearson"))
# https://www.r-graph-gallery.com/199-correlation-matrix-with-ggally.html
# Quick display of two capabilities of GGally: assessing the distribution
# and the correlation of variables.
library("GGally")
# Example taken from the help page:
data("flea")
head(flea, n = 6L)
## species tars1 tars2 head aede1 aede2 aede3
## 1 Concinna 191 131 53 150 15 104
## 2 Concinna 185 134 50 147 13 105
## 3 Concinna 200 137 52 144 14 102
## 4 Concinna 173 127 50 144 16 97
## 5 Concinna 171 118 49 153 13 106
## 6 Concinna 160 118 47 140 15 99
# Pairwise plots coloured by group: columns 2-4 of flea by species, then
# columns 1-12 of decathlon2 by competition.
ggpairs(
  data = flea,
  mapping = ggplot2::aes(colour = species),
  columns = 2:4
)
ggpairs(
  data = decathlon2,
  mapping = ggplot2::aes(colour = Competition),
  columns = 1:12
)